/*** COMBINES VARIOUS PUBLIC USE DATASETS TO CREATE CAMPUS-YEAR DATASET 2010+ ***/

clear all
set more 1

do "paths.do"

capture log close

log using "$LOGFILES/txtr_1.log", replace


/*** TEA AEIS/TAPR ANNUAL DISTRICT DATA ***/

** read in raw annual data
use "$DATAFILES/TEA/aeis_dist", clear
tab year, m
keep if year>=2005 & year<=2019

** flags
descr dfl*
assert dflyre==""
drop dflyre
tab dflchart, m
 la var dflchart "dist flag: charter operator"

** district accountablity rating
tab year dflalted, m
replace dflalted="1" if dflalted=="Y"
replace dflalted="0" if dflalted=="N"
destring dflalted, replace
 la var dflalted "dist flag: rated under alt ed acct (AEA) procedures"
* Note: confirmed meanings of codes via online academic performance reports by district
* 2004-2011
* TAKS (w/required improvement/exceptions)
* E=exemplary, R=recognized, A=acceptable L=low-performing
* alternative education accountability (AEA) for registered campuses and charters
* 1=AEA:academically acceptable, 2=AEA:academically unacceptable, 3=AEA:not rated-other
* X=not rated-other, I=not rated-data integrity issues
tab d_rating year if year<=2011
tab d_rating dflchart if year<=2011
tab district year if d_rating=="2" & year<=2011
tab district year if d_rating=="3" & year<=2011
tab district year if d_rating=="I" & year<=2011
tab district year if d_rating=="X" & year<=2011 & dflchart==0
tab district year if d_rating=="X" & year<=2011 & dflchart==1
gen dflfail=(d_rating=="2"|d_rating=="L") if year<=2011
replace dflalted=(d_rating=="1"|d_rating=="2"|d_rating=="3") if year<=2011
* no ratings assigned in 2012
assert d_rating=="" if year==2012
replace dflfail=0 if year==2012
assert year==2012 if dflalted==.
* fill in AEA status based on prior year (or, if not present then, use following year)
bys district (year): replace dflalted=dflalted[_n-1] if dflalted==.
bys district (year): replace dflalted=dflalted[_n+1] if dflalted==.
assert dflalted!=.
* 2013-17
* 4 indices (student achievement, student progress, closing gaps, post-sec readiness)
* M=met standard, I=improvement required, A=AEA:met alternative standard
* Q=not rated-data integrity issues, X,Z=not rated
tab d_rating dflalted if year>=2013 & year<=2017
tab district year if d_rating=="Q" & year>=2013 & year<=2017
tab district year if d_rating=="Z" & year>=2013 & year<=2017
tab district year if d_rating=="X" & year>=2013 & year<=2017 & dflalted==0
tab district year if d_rating=="X" & year>=2013 & year<=2017 & dflalted==1
replace dflfail=(d_rating=="I") if year>=2013 & year<=2017
* 2018+
* A=exemplary, B=recognized, C=acceptable, D=in need of improvement, F=unacceptable/improvement required
* A-D classified as met standard
* in 2018, not rated if affected by Hurricane Harvey and would otherwise be rated below A
tab d_rating dflalted if year==2018, m
tab d_rating dflalted if year==2019, m
replace dflfail=(d_rating=="F"|d_rating=="Improvement Required") if year>=2018
tabstat dflfail, by(year)
 la var dflfail "dist flag: rated failing under state accountability system"
drop d_rating

** keep only needed variables
descr
rename district distnum
rename county cntynum
rename cntyname cntyname
rename region dregion
 la var dregion "dist 2-digit ed service center (ESC) region"
rename dpetallc denrtot
* professional staff = teachers + campus admin + central admin + professional support
gen tmp1=(dpsttofc+dpsutofc+dpsstofc+dpsctofc)/dpsptofc
list dpsttofc dpsutofc dpsstofc dpsctofc dpsptofc if tmp1==.
summ tmp1, detail
gen dstfcadmp=dpsctofc/dpsptofc
 la var dstfcadmp "dist staff: % central administrators"
* financial variables only included in annual performance reports through 2011 or 2012
* expenditures and revenues are actual (not budgeted), so are for the prior school year
* the tax variables are for tax year XXXX, if the school year is XXXX-YYYY
summ dpf*
tab year if dpfrloct!=.
tab year if dpfvtott!=.
rename dpf* F*
keep dist* year cnty* dregion dfl* denrtot dstfcadmp F*

save "$WORKING/temp/txtr_1_0", replace


/*** TEA PEIMS ANNUAL FINANCIAL DATA ***/

** read in raw data
use "$DATAFILES/peims_fin", clear
descr

** merge to base dataset
* rename variables to differentiate
rename dpf* PF*
tabstat PF*, by(year)
merge 1:1 distnum year using "$WORKING/temp/txtr_1_0", update
assert _merge<=3
tab year _merge
drop if _merge==1
drop _merge

** check correlations between PEIMS and AEIS/TAPR values
* REVENUES
 descr Fr* PFr*
 * create totals from PEIMS variables to correspond to those in AEIS/TAPR reports
 gen PFrloct=PFrlocop+PFrlocis
 gen PFrstat=PFrstaop+PFrstadbt+PFrstatrs
 gen PFrotht=PFrothop
 gen PFrfedt=PFrfedop
 gen PFrallt=PFrloct+PFrstat+PFrotht+PFrfedt
 tabstat Fr*, by(year)
 * Note: federal recovery & reinvestment funds reallocated by state were counted as state 2010/2011
 for var Fr*: replace X=X/PX
 tabstat Fr*, by(year)
 drop Fr* PFrloct PFrstat PFrotht PFrfedt PFrallt
* EXPENDITURES BY FUNCTION
 descr Fe* PFe*
 tabstat Fe*, by(year)
 for var Fe*: replace X=X/PX
 tabstat Fe*, by(year)
 drop Fe*
* EXPENDITURES BY PROGRAM
 descr Fp* PFp*
 tabstat Fp*, by(year)
 * total program operating expenditures excluded unallocated expenses
 rename PFpallt HOLD
 gen double PFpallt=HOLD-PFpexcl
 for var Fp*: capture replace X=X/PX
 tabstat Fp*, by(year)
 drop Fp* PFpallt
 rename HOLD PFpallt
* STANDARDIZED TAX BASE WEALTH
 descr Fvtott PFvtott
 tabstat Fvtott, by(year)
 replace Fvtott=Fvtott/PFvtott
 tabstat Fvtott, by(year)
 drop Fvtott
* TAX BASE COMPONENTS
 * these are only available in AEIS/TAPR
 * exceed total standardized tax property tax base due to homestead exemptions, etc.
 descr Fv*
 gen tmp=(Fvbust+Fvlant+Fvoilt+Fvotht+Fvrest)/PFvtott
 summ tmp, detail
 gen dfvresp=Fvrest/(Fvbust+Fvlant+Fvoilt+Fvotht+Fvrest)
  la var dfvresp "dist fin: property value residential share"
 drop Fv* tmp
rename PF* df*

** adjust timing of financial variables
sort distnum year
* tax base wealth is from the prior year final tax year (e.g., 2015 for 2015-16, which is used for 2016-17 funding)
foreach var of varlist dfv*{
 bys distnum (year): gen double tmp=`var'[_n-1]
 replace `var'=tmp
 drop tmp
}
* revenues/expenditures are from the prior year (e.g., 2015-16 for 2016-17)
foreach var of varlist dfr* dfe* dfp*{
 bys distnum (year): gen double tmp=`var'[_n+1]
 replace `var'=tmp
 drop tmp
}
drop dfrlocis dfrstadbt dfrothrec dfrothrev dfrlocequ dfrdbt dfrstatrs dfrall

** adjust naming of financial variables so is clear are totals
descr dfr* dfe* dfp*
rename dfr*op dfr*t
rename dfpexcl dfpexclt

** create any needed lagged variables and drop years prior to 2010
sort distnum year
forval t=1/5{
 gen dflfail_`t'=dflfail[_n-`t'] if distnum[_n-`t']==distnum[_n-`t'] & year==year[_n-`t']+`t'
 la var dflfail_`t' "dist flag: rated failing in t-`t'"
}
drop if year<2010

save "$WORKING/temp/txtr_1_1", replace


/*** TEA AEIS/TAPR ANNUAL CAMPUS DATA ***/

** read in raw data
use "$DATAFILES/TEA/aeis_camp", clear
tab year, m
keep if year>=2005 & year<=2019

** flags
descr campus campname county region grdtype grdspan cfl*
rename grdtype cegrdtype
rename grdspan cegrdspan
 la var cegrdspan "camp grade span"
* drop 2005 campuses with no student information
summ if cegrdtype==""
assert cpetall==. if cegrdtype==""
drop if cegrdtype==""
tab cegrdspan cegrdtype, m
tab cflchart, m
 la var cflchart "camp flag: charter school"
drop county region

** campus accountability rating
* if no tested grades, paired with another feeder campus for accountability
descr pair*
tab cegrdspan if paircamp!=.
tab year cflalted, m
 la var cflalted "camp flag: rated under AEA procedures"
* 2004-2011
tab c_rating year if year<=2011
tab c_rating cflchart if year<=2011
* most not rated have too few students in tested grades
gen tmp1=cpetg03c+cpetg04c+cpetg05c+cpetg06c+cpetg07c+cpetg08c+cpetg09c+cpetg10c+cpetg11c
summ tmp1 if c_rating=="X" & year<=2011, detail
gen cflfail=(c_rating=="2"|c_rating=="L") if year<=2011
replace cflalted=(c_rating=="1"|c_rating=="2"|c_rating=="3") if year<=2011
* no ratings assigned in 2012
assert c_rating=="" if year==2012
replace cflfail=0 if year==2012
assert year==2012 if cflalted==.
bys campus (year): replace cflalted=cflalted[_n-1] if cflalted==.
bys campus (year): replace cflalted=cflalted[_n+1] if cflalted==.
replace cflalted=0 if cflalted==.
* 2013-17
tab c_rating cflalted if year>=2013 & year<=2017
* T=not rated-annexation
list campus year if c_rating=="T" & year>=2013 & year<=2017
summ tmp1 if c_rating=="X" & year>=2013 & year<=2017, detail
summ tmp1 if c_rating=="Z" & year>=2013 & year<=2017, detail
replace cflfail=(c_rating=="I") if year>=2013 & year<=2017
* 2018+
* A-F not assigned to campuses until 2019, just whether met standard or not
tab c_rating cflalted if year==2018, m
tab c_rating cflalted if year==2019, m
replace cflfail=(c_rating=="F"|c_rating=="Improvement Required") if year>=2018
tabstat cflfail, by(year)
 la var cflfail "camp flag: rated failing under state accountability system"
drop pair* c_rating tmp*

** campus financial variables only available in AEIS/TAPR through 2011
descr cpf*
tabstat cpf*, by(year)
drop cpf*

** student counts
descr cpet*
summ cpet*
* by grade
gen tmp=cpetgeec+cpetgpkc+cpetgknc
forvalues x=1/9{
 replace tmp=tmp+cpetg0`x'c
}
forvalues x = 10/12{
 replace tmp=tmp+cpetg`x'c
}
replace tmp=tmp/cpetallc
assert tmp==1
* by race/ethnicity
tab year if cpettwoc!=.
replace tmp=cpetwhic+cpetblac+cpethisc+cpetindc+cpetpacc
replace tmp=tmp+cpettwoc if cpettwoc!=.
replace tmp=tmp/cpetall
assert tmp==1
gen cpetothc=cpetallc-cpetwhic-cpetblac-cpethisc-cpetpacc
 la var cpetothc "camp student: other (incl multiple) race/ethnicity count"
* by program
for any spe bil voc lep eco rsk: replace tmp=cpetXc/cpetallc \ assert tmp<=1 if tmp!=.
summ cpetdisp, detail
tab year if cpetdisp>1 & cpetdisp!=.
replace cpetdisp=. if cpetdisp>1
drop cpetindc cpettwoc cpetdisp tmp*
rename cpetallc cenrtot
rename cpet* ce*
* mobility
descr cpemallp
rename cpemallp cemobp

** create student shares
descr ce*c
foreach i of varlist ce*c{
 gen `i'p=`i'/cenrtot
 local a : variable label `i'
 local a : subinstr local a "count" "pct"
 label var `i'p "`a'"
}
rename ce*cp ce*p

** teacher counts
descr cpst*
summ cpst*
gen tmp=(cpst00fc+cpst01fc+cpst06fc+cpst11fc+cpst20fc)/cpsttofc
summ tmp, detail
replace tmp=cpstwhfc+cpstblfc+cpsthifc+cpstinfc+cpstpafc
replace tmp=tmp+cpsttwfc if cpsttwfc!=.
replace tmp=tmp/cpsttofc
summ tmp, detail
replace tmp=(cpstmafc+cpstfefc)/cpsttofc
summ tmp, detail
replace tmp=(cpstrefc+cpstvofc+cpstbifc+cpstcofc+cpstgifc+cpstspfc+cpstopfc)/cpsttofc
summ tmp, detail
summ cpsttena cpstexpa, detail
drop cpst00fc cpst01fc cpst06fc cpst11fc cpst20fc
drop cpstwhfc cpstblfc cpsthifc cpstinfc cpstpafc cpsttwfc
drop cpstmafc cpstfefc tmp*
rename cpst* ct*

** other staff counts
descr cps*
summ cps*
rename cps* cs*

** class sizes
descr cpct*
summ cpct*
drop cpctenga cpctscia cpctsoca cpctflaa
rename cpct* cc*

** create any needed lagged variables and drop years prior to 2010
sort campus year
forval t=1/5{
 gen cflfail_`t'=cflfail[_n-`t'] if campus==campus[_n-`t'] & year==year[_n-`t']+`t'
 la var cflfail_`t' "camp flag: rated failing in t-`t'"
}
drop if year<2010

** merge to base district-level dataset
list campus in 1
gen distnum=floor(campus/1000)
summ distnum
merge n:1 distnum year using "$WORKING/temp/txtr_1_1", update
assert _merge==3
drop _merge

save "$WORKING/temp/txtr_1_2", replace


/*** COUNTY-LEVEL PRESIDENTIAL ELECTION DATA ***/

** read in raw data
import delimited "$DATAFILES/countypres_2000-2016.csv", clear
descr
list in 1
assert version==20191203
tab party, m
replace party="other" if inlist(party,"green","NA")
replace candidatevotes = "0" if (candidatevotes=="NA")
destring candidatevotes, force replace

** sum votes across other party candidates
duplicates tag year state state_po county fips party totalvotes, gen(tmp)
tab tmp
tab party if tmp==1
collapse (sum) candidatevotes, by(year state state_po county fips party totalvotes)

** reshape to one observation per county-year
reshape wide candidatevotes, i(year state state_po county fips totalvotes) j(party) string
rename candidatevotesdemocrat cntyvotd
rename candidatevotesrepublican cntyvotr
rename candidatevotesother cntyvoto

** keep Texas only
keep if state=="Texas"
drop state*

** clean variables
rename county cntyname
rename fips cntyfips
destring cntyfips, replace
gen totalvotes0 = cntyvotd + cntyvotr + cntyvoto
assert totalvotes0==totalvotes
drop totalvotes*

** calculate D share of the two-party vote
gen cntydemp=cntyvotd/(cntyvotd+cntyvotr)
summ cntydemp
egen cntydempa=mean(cntydemp), by(cntyfips)

** convert to wide so can use vote shares from any year as county-level static variables
keep cntyname cntyfips year cntydem*
reshape wide cntydemp, i(cntyname cntyfips cntydempa) j(year)
foreach num of numlist 2000 2004 2008 2012 2016{
 la var cntydemp`num' "county 2-party Democratic vote share in `num'"
}
la var cntyname "county name"
la var cntyfips "county FIPS code"
la var cntydempa "county average 2-party Democratic vote share, 2000-2016 elections"

** merge to base dataset
replace cntyname = upper(cntyname)
duplicates report cntyname
merge 1:n cntyname using "$WORKING/temp/txtr_1_2.dta", update
assert _merge<=3
* Note: Loving county is the 2nd-least populous county in U.S. in 2017 (134 residents)
list cnty* if _merge==1
drop if _merge==1
drop _merge
save "$WORKING/temp/txtr_1_3", replace


/*** COMMON CORE OF DATA (CCD) DISTRICT DATA ***/

** Note: geographic associations from the CCD are based on location of administrative office

** read in raw data
use "$DATAFILES/tx_ccd_district.dta", clear
descr year
gen tmp=year+1
drop year
rename tmp year
keep if year>=2010
tab year, m

** rename needed variables
keep leaid state_leaid year latitude longitude urban_centric_locale ///
 congress_district_id state_leg_district_lower state_leg_district_upper ///
 agency_type boundary_change_indicator enrollment
rename leaid dfed_leaid
destring dfed_leaid, replace
rename state_leaid distnum
replace distnum = subinstr(distnum,"TX-","", .)
destring distnum, replace
assert dfed_leaid!=. & distnum!=.
rename congress_district_id dgcongr
 la var dgcongr "dist geopol: state and 114th congressional district id"
rename latitude dglat
rename longitude dglon
 la var dglat "dist geopol: latitude"
 la var dglon "dist geopol: longitude"
destring state_leg_district_lower, replace
destring state_leg_district_upper, replace
rename state_leg_district* dgstleg*
 la var dgstleg_lower "dist geopol: state legislative district-lower"
 la var dgstleg_upper "dist geopol: state legislative district-upper"
descr, full

** urban/rural status
* note: each of 3 locale types is divided into 3 subtypes according to population size
* (for city/suburban) and proximity to urban areas (for town/rural)
assert urban_centric_locale!=.
tab urban_centric_locale
tab urban_centric_locale, nol
gen dgsuburb=(urban_centric_locale>=21 & urban_centric_locale<=23)
gen dgtown=(urban_centric_locale>=31 & urban_centric_locale<=33)
gen dgrural=(urban_centric_locale>=41 & urban_centric_locale<=43)
 la var dgsuburb "dist geopol: suburban (outside principal city/inside urbanized area)"
 la var dgtown "dist geopol: town (inside urban cluster/outside urbanized area)"
 la var dgrural "dist geopol: rural (rural territory)"
drop urban_centric_locale

** merge to base dataset
merge 1:n distnum year using "$WORKING/temp/txtr_1_3.dta", update
assert _merge==1|_merge==3
tab year agency_type if _merge==1, m
tab year boundary if _merge==1, m
summ enrollment if _merge==1, detail
drop if _merge==1
drop _merge

** cross-check overlapping variables
tab agency_type dflchart, m
summ enrollment
gen tmp=enrollment/denrtot if enrollment>=0
summ tmp, detail
drop agency_type enrollment tmp*

** boundary changes
tab boundary_change_indicator, m
tab boundary_change_indicator, nol
gen dflbound=(boundary_change_indicator==5) if boundary_change_indicator!=.
gen dflnew=(boundary_change_indicator==3) if boundary_change_indicator!=.
 la var dflbound "dist flag: significant change in geographical boundary (CCD)"
 la var dflnew "dist flag: district is new (CCD)"
drop boundary_change_indicator

save "$WORKING/temp/txtr_1_4", replace


/*** COMMON CORE OF DATA (CCD) SCHOOL DATA ***/

** read in raw data
use "$DATAFILES/tx_ccd_school.dta", clear
descr year
gen tmp=year+1
drop year
rename tmp year
keep if year>=2010
tab year, m

** rename needed variables
keep year seasch latitude longitude ///
 congress_district_id state_leg_district_lower state_leg_district_upper ///
 school_status title_i_status title_i_eligible title_i_schoolwide ///
 charter magnet shared_time virtual ///
 lunch_program free_lunch reduced_price_lunch free_or_reduced_price_lunch ///
 enrollment
rename seasch campus
replace campus=regexr(campus,"[0-9]+-","")
destring campus, replace
rename congress_district_id cgcongr
 la var cgcongr "camp geopol: state and 114th congressional district id"
rename latitude cglat
rename longitude cglon
 la var cglat "camp geopol: latitude"
 la var cglon "camp geopol: longitude"
destring state_leg_district_lower, replace
destring state_leg_district_upper, replace
rename state_leg_district* cgstleg*
 la var cgstleg_lower "camp geopol: state legislative district-lower"
 la var cgstleg_upper "camp geopol: state legislative district-upper"
descr, full

** merge to base dataset
merge 1:1 campus year using "$WORKING/temp/txtr_1_4.dta", update
assert _merge<=3
tab year _merge
tab school_status _merge
tab school_status, nol
summ enrollment if _merge==1 & school_status==1, detail
drop if _merge==1
drop _merge

** cross-check overlapping variables
tab charter cflchart, m
gen tmp=enrollment/cenrtot if enrollment>=0
summ tmp, detail
drop charter enrollment tmp*

** create flags
tab school_status, m
tab school_status, nol
gen cflnew=(school_status==3) if school_status!=.
gen cflchgd=(school_status==5) if school_status!=.
 la var cflnew "camp flag: campus is new (CCD)"
 la var cflchg "camp flag: campus changed agency (CCD)"
/* fields for Title I status:
 1 School is eligible for Title I Targeted Assistance (TAS) but provides no program
 2 School is eligible for Title I Targeted Assistance (TAS) and provides TAS program
 3 School is eligible for Title I Schoolwide program (SWP) and provides Targeted Assistance program (TAS)
 4 School is eligible for Title I Schoolwide program (SWP) but provides no program
 5 School is eligible for Title I Schoolwide program (SWP) and provides Title I Schoolwide program
 6 School is not eligible for either Title I Targeted Assistance or Schoolwide program
*/
tab title_i_status title_i_eligible, m
tab title_i_status title_i_schoolwide, m
tab title_i_status, nol
gen cflt1eltas=(title_i_status==1|title_i_status==2) if title_i_status!=.
gen cflt1elswp=(title_i_status>=3&title_i_status<=5) if title_i_status!=.
gen cflt1prtas=(title_i_status==2|title_i_status==3) if title_i_status!=.
gen cflt1prswp=(title_i_status==5) if title_i_status!=.
 la var cflt1eltas "camp flag: eligible for Title I targeted assistance (CCD)"
 la var cflt1elswp "camp flag: eligible for Title I schoolwide program (CCD)"
 la var cflt1prtas "camp flag: provides Title I targeted assistance (CCD)"
 la var cflt1prswp "camp flag: provides Title I schoolwide program (CCD)"
tab year magnet, m
tab magnet, nol
gen cflmagnet=(magnet==1) if magnet==0|magnet==1
 la var cflmagnet "camp flag: magnet school (CCD)"
tab shared_time, m
tab year virtual, m
tab virtual, nol
tab campname if virtual==1
gen cflvirtual=(virtual==1) if virtual==0|virtual==1
 la var cflvirtual "camp flag: virtual school (CCD)"
tab lunch_program, m
* -3=suppressed, -2=not applicable, -1=missing/not reported
foreach var of varlist free_lunch reduced_price_lunch free_or_reduced_price_lunch{
 tab `var' if `var'<0
 tab lunch_program if `var'<0, m
 replace `var'=. if `var'<0
}
summ *lunch*
gen tmp=(free_lunch+reduced_price_lunch)/free_or_reduced_price_lunch
assert tmp==1|tmp==.
replace tmp=free_or_reduced_price_lunch/ceecoc
summ tmp, detail
rename free_lunch celunchfrc
rename reduced_price_lunch celunchredc
 la var celunchfrc "camp student: free lunch eligible count (CCD)"
 la var celunchredc "camp student: reduced price lunch eligible count (CCD)"
drop school_status title_i* magnet shared_time virtual lunch_program free_or* tmp*

save "$WORKING/temp/txtr_1_5", replace


/*** DISTRICT CENSUS/ACS DISSIMILARITY INDICES (DERIVED FROM BLOCK GROUP DATA) ***/

** read in data
use "$DATAFILES/txbg_3_dissim", clear
descr
summ
drop distname

** explore how stable dissimilarity indices are across years
tab year, m
xtset fed_leaid
for var dbg_D*: xtreg X, fe

** collapse to time-invariant dataset
foreach var of varlist dbg* {
 egen tmp=mean(`var'), by(fed_leaid)
 replace `var'=tmp
 drop tmp
}
bys fed_leaid: gen tmp=_n
keep if tmp==1
drop year tmp

** rename and label variables
rename dflag_exit dbg_flexit
foreach i of varlist dbg* {
 local a : variable label `i'
 local a : subinstr local a "to district" ""
 local a : subinstr local a "district " ""
 local a : subinstr local a "estimated " ""
 local a : subinstr local a ", by summing across BGs" ""
 label var `i' "dist ACS: `a'"
}
la var dbg_num "dist ACS: number of block groups (BGs)"
foreach i of varlist dbg_pop dbg_pmin dbg_Dmin dbg_phied dbg_Ded dbg_ppov dbg_Dpov {
 local a : variable label `i'
 label var `i' "`a' (2010-2016 avg)"
}

** merge to base dataset
rename fed_leaid dfed_leaid
merge 1:n dfed_leaid using "$WORKING/temp/txtr_1_5.dta", update
assert _merge<=3
* note: mapping is only done for geographically defined schools districts
* so, e.g., will not cover charter districts
tab dbg_flexit if _merge==1, m
tab dflchart dflalted if _merge==2, m
drop if _merge==1
drop _merge

save "$WORKING/temp/txtr_1_6.dta", replace


/*** EDGE DISTRICT CENSUS/ACS DATA ***/

** read in raw data
import delimited "$DATAFILES/NCES-Edge Clean.csv", clear
gen year=yearstart+2
rename leaid dfed_leaid
keep if year>=2010
tab year
summ

** child-based variables
descr chienrl* chipubl*, full
foreach x in enrl publ{
 replace chi`x'_medfaminc="" if chi`x'_medfaminc=="-"
 replace chi`x'_medfaminc="250000" if chi`x'_medfaminc=="250,000+"
 destring chi`x'_medfaminc, replace
 tabstat chi`x'*, by(year)
 gen tmp=(chi`x'_hhtypefamcouple+chi`x'_hhtypefamfemale1+chi`x'_hhtypefammale1)/chi`x'_hhtypefam
 summ tmp, detail
 replace tmp=(chi`x'_hhtypefam/chi`x'_numhouseholds)
 summ tmp, detail
 drop tmp
}
* % single-parent among families
gen dedge_sgpar=(chienrl_hhtypefamfemale1+chienrl_hhtypefammale1)/chienrl_hhtypefam
 la var dedge_sgpar "dist EDGE: share of pub sch families headed by single parent"
* % familes w/enrolled children in private school
gen dedge_priv=(chienrl_hhtypefam-chipubl_hhtypefam)/chienrl_hhtypefam
 la var dedge_priv "dist EDGE: share families w/kids enr in priv sch"
* median family income
rename chipubl_medfaminc dedge_medfaminc
 la var dedge_medfaminc "dist EDGE: median public sch family income"
* SNAP receipt
rename chipubl_snapreceipt dedge_snap
 la var dedge_snap "dist EDGE: share of public sch families w/SNAP"
summ dedge*, detail

** parent based variables
descr all_*, full
summ all_*
rename all_educbachplus dedge_bachplus
 la var dedge_bachplus "dist EDGE: share of people w/BA or higher"
rename chienrl_poverty dedge_poverty
 la var dedge_poverty "dist EDGE: share of enrolled sch households in poverty"
// corr parpubl_unemp*
rename all_unemployed dedge_unemp
 la var dedge_unemp "dist EDGE: share of labor force age 16+ unemployed"

** create SES index
* Reardon (2019): 1st principal component of 6 measures for families w/children enrolled in public schools
* 1) median family income 2) % adults BA or higher 3) poverty rate 4) unemp rate
* 5) SNAP eligibility rate 6) % families headed by single parent
* the measure is standardized to have a mean of 0 and standard deviation of 1 across districts
tabstat dedge*, by(year)
for var dedge*: tab year if X==.
pca dedge_medfaminc dedge_bachplus dedge_poverty dedge_unemp dedge_snap dedge_sgpar
predict pc1, score
egen tmp1=mean(pc1)
egen tmp2=sd(pc1)
gen dedge_ses1=(pc1-tmp1)/tmp2
 la var dedge_ses1 "dist EDGE: SES standardized index (6 vars)"
drop pc1 tmp*
* create alternative that uses subset that is less frequently missing
pca dedge_medfaminc dedge_snap dedge_sgpar
predict pc1, score
egen tmp1=mean(pc1)
egen tmp2=sd(pc1)
gen dedge_ses2=(pc1-tmp1)/tmp2
 la var dedge_ses2 "dist EDGE: SES standardized index (3 vars)"
corr dedge_ses*

** collapse to average across years
keep dfed_leaid dedge*
foreach var of varlist dedge* {
 egen tmp=mean(`var'), by(dfed_leaid)
 replace `var'=tmp
 drop tmp
}
bys dfed_leaid: gen tmp=_n
keep if tmp==1
drop tmp

** adjust labels
foreach i of varlist dedge* {
 local a : variable label `i'
 label var `i' "`a' (2010-2016 avg)"
}
descr

** merge to base dataset
merge 1:n dfed_leaid using "$WORKING/temp/txtr_1_6.dta", update
assert _merge<=3
tab dflchart dflalted if _merge==2, m
drop if _merge==1
drop _merge

save "$WORKING/temp/txtr_1_7", replace


/*** PRIVATE SCHOOL SURVEY COUNTY-LEVEL DATA ***/

use "$DATAFILES/tx_pss", clear
descr
rename prisch_* cntypri_*
tab year
merge 1:n cntyfips year using "$WORKING/temp/txtr_1_7.dta", update
assert _merge==2|_merge==3
tab year _merge
drop _merge

qui compress
order campus campname distnum distname dfed_leaid year dregion ///
 dfl* dg* denrtot dfr* dfe* dfp* dfv* dstfcadmp dbg* dedge* cnty* ///
 cfl* cg* cenrtot ce* cs* ct* cc*a
la var dfed_leaid "district identification number (federal)"
la var distnum "district identification number (state)"
foreach i of varlist de* ce*{
 local a : variable label `i'
 local a : subinstr local a "camp student" "camp stud"
 local a : subinstr local a "dist student" "dist stud"
 label var `i' "`a'"
}
for var *glat *glon *congr *stleg* celunch*: label values X

descr
summ
save "$WORKING/txtr_1", replace

log close
